/**
* Copyright 2011 Applied Research in Patacriticism and the University of Virginia
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
**/
package org.nines;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.io.IOUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.openrdf.model.Statement;
import org.openrdf.rio.ParseErrorListener;
import org.openrdf.rio.RDFHandler;
import org.openrdf.rio.RDFHandlerException;
import org.openrdf.rio.RDFParseException;
import org.openrdf.rio.rdfxml.RDFXMLParser;
/**
* RDF document parser that only handles full text fields. It will
* spider out to external sites, scrape text and write it to the
* solr raw text directory
*
* @author loufoster
*
*/
final class RdfTextSpider implements RDFHandler {
private ErrorReport errorReport;
private RDFIndexerConfig config;
private HttpClient httpClient;
public RdfTextSpider(RDFIndexerConfig config, ErrorReport errorReport) {
this.config = config;
this.errorReport = errorReport;
this.httpClient = new HttpClient();
}
/**
* Parse the RDF file for the text field. Spider the URL specifed and
* write text from this site to the raw text files.
*
* @param file
*/
public void spider( final File file ) {
RDFXMLParser parser = new RDFXMLParser();
parser.setRDFHandler( this );
parser.setParseErrorListener( new ParseListener(file, errorReport));
parser.setVerifyData(true);
parser.setStopAtFirstError(false);
try {
InputStreamReader is = new InputStreamReader(new FileInputStream(file) );
parser.parse( is, "http://foo/" + file.getName());
} catch (RDFParseException e) {
errorReport.addError(new IndexerError(file.getName(), "", "Parse Error on Line " + e.getLineNumber() + ": "
+ e.getMessage()));
} catch (RDFHandlerException e) {
errorReport.addError(new IndexerError(file.getName(), "", "StatementHandler Exception: " + e.getMessage()));
} catch (Exception e) {
errorReport.addError(new IndexerError(file.getName(), "", "RDF Parser Error: " + e.getMessage()));
e.printStackTrace();
}
}
/**
* Handle RDF statements. This only cares about TEXT statements and will scrape
* text from the URL specified.
*/
public void handleStatement(Statement statement) throws RDFHandlerException {
String predicate = statement.getPredicate().stringValue().trim();
String object = statement.getObject().stringValue().trim();
// if the object of the triple is blank, skip it, it is nothing worth indexing
if (object == null || object.length() == 0) {
return;
}
// only care about TEXT
if ("http://www.collex.org/schema#text".equals(predicate) == false ) {
return;
}
// only care if it looks like a URL and is not a PDF
if (object.startsWith("http://") ) {
getRawText(object);
}
}
/**
* Get the full text from an external site an write it untouched to the
* rawtext area of the solr sources. If any errors occur,leave any
* prior versions of the rawtext untouched, log the errors and return
* @param urlString
* @return
*/
private void getRawText(String urlString) {
String rawFile = urlString.replaceAll("/", "SL");;
rawFile = rawFile.replace(":", "CL");
rawFile = rawFile.replace("?", "QU");
rawFile = rawFile.replace("=", "EQ");
rawFile = rawFile.replace("&", "AMP");
rawFile = rawFile + ".txt";
String rawRoot = findRawTextRoot();
rawRoot += RDFIndexerConfig.safeArchive( this.config.archiveName );
File urlFile = new File(rawRoot + "/"+ rawFile );
// scrape the content from remote host...
byte[] bytes = null;
try {
if (urlString.endsWith(".pdf") || urlString.endsWith(".PDF")) {
bytes = scrapeExternalPDF(urlString);;
} else {
bytes = scrapeExternalText(urlString);
}
} catch (IOException e) {
this.errorReport.addError(
new IndexerError( "", urlString, "Unable to create get external text: "+e.toString()));
return;
}
// At this point, we have new data. Delete the old - this does
// nothing if the file does not yet exist
urlFile.delete();
// Make sure that the directory structure leadign up
// to the detination file exists
if ( urlFile.getParentFile().exists() == false) {
if ( urlFile.getParentFile().mkdirs() == false ) {
this.errorReport.addError(
new IndexerError(urlFile.toString(), urlString, "Unable to create raw text file"));
return;
}
}
// dump the content to the file
try {
IOUtils.write(bytes, new FileOutputStream(urlFile));
} catch (IOException e) {
this.errorReport.addError(
new IndexerError(urlFile.toString(), urlString, "Unable to create get external text: "+e.toString()));
}
}
/**
* find the full path to the raw text root baseed on
* the full path to the original rdf sources
* @return
*/
private String findRawTextRoot() {
String path = this.config.sourceDir.toString();
int pos = path.indexOf("/rdf/");
path = path.substring(0, pos) + "/rawtext/";
return path;
}
/**
* Extract the text from the specified URI
* @param url
* @return
*/
private byte[] scrapeExternalText(final String url) throws IOException {
GetMethod get = new GetMethod(url);
int result;
try {
result = this.httpClient.executeMethod(get);
if (result != 200) {
throw new IOException(result + " code returned for URL: " + url);
}
return IOUtils.toByteArray( get.getResponseBodyAsStream() );
} catch (IOException e ) {
throw e; // just rethrow it
} finally {
get.releaseConnection();
}
}
/**
* Extract the text from the PDF specified by the URI
* @param uri
* @return
* @throws IOException
*/
private byte[] scrapeExternalPDF( final String uri ) throws IOException {
InputStream is = null;
GetMethod get = new GetMethod(uri);;
PDDocument pdfDoc = null;
try {
int result;
result = httpClient.executeMethod(get);
if (result != 200) {
throw new IOException(result + " code returned for URL: " + uri);
}
is = get.getResponseBodyAsStream();
pdfDoc = PDDocument.load(is);
PDFTextStripper pdfStrip = new PDFTextStripper();
return pdfStrip.getText( pdfDoc ).getBytes();
} catch (IOException e ) {
throw e; // just rethrow it
} finally {
try{
get.releaseConnection();
IOUtils.closeQuietly(is);
if ( pdfDoc != null ) {
pdfDoc.close();
}
} catch (Exception e) {}
}
}
public void startRDF() throws RDFHandlerException {
// NO-OP
}
public void endRDF() throws RDFHandlerException {
// NO-OP
}
public void handleNamespace(String prefix, String uri) throws RDFHandlerException {
// NO-OP
}
public void handleComment(String comment) throws RDFHandlerException {
// NO-OP
}
/**
* Listen for parse errors and write them to the error report
* @author loufoster
*
*/
private static final class ParseListener implements ParseErrorListener {
private ErrorReport errorReport;
private File file;
ParseListener(File file, ErrorReport errorReport ) {
this.errorReport = errorReport;
this.file = file;
}
public void warning(String msg, int lineNo, int colNo) {
this.errorReport.addError(new IndexerError(file.getName(), "",
"Parse warning at line "+lineNo+", col "+colNo+" : " + msg));
}
public void error(String msg, int lineNo, int colNo) {
this.errorReport.addError(new IndexerError(file.getName(), "",
"Parse error at line "+lineNo+", col "+colNo+" : " + msg));
}
public void fatalError(String msg, int lineNo, int colNo) {
this.errorReport.addError(new IndexerError(file.getName(), "",
"FATAL PARSE ERROR at line "+lineNo+", col "+colNo+" : " + msg));
}
}
}